iT邦幫忙

2024 iThome 鐵人賽

DAY 21
0
AI/ ML & Data

AI 學習紀錄系列 第 21

Day 21: Stable Cascade 測試

  • 分享至 

  • xImage
  •  

接續昨天,batch_size 可以控制一次產圖的數量,這在 ComfyUI 中也可以在 Empty Latent Image 或 StableCascade_EmptyLatentImage 中調,可以找一下試試。
extras.sampling_configs 中的 shift 可能是跟處理雜訊有關,待確認。timesteps 就是 steps 。
t_start 跟 timesteps 有關控制的可能是跟 denoise 有關,一直使用方便的 API 跟簡單的工具,反而不知道直接用用 torch 怎麼給參數了,日後若有研究再補上。

batch_size = 2
caption = "Cinematic photo of an anthropomorphic penguin sitting in a cafe reading a book and having a coffee"
height, width = 1024, 1024
stage_c_latent_shape, stage_b_latent_shape = calculate_latent_sizes(height, width, batch_size=batch_size)

# Stage C Parameters
extras.sampling_configs['cfg'] = 4
extras.sampling_configs['shift'] = 2
extras.sampling_configs['timesteps'] = 20
extras.sampling_configs['t_start'] = 1.0

# Stage B Parameters
extras_b.sampling_configs['cfg'] = 1.1
extras_b.sampling_configs['shift'] = 1
extras_b.sampling_configs['timesteps'] = 10
extras_b.sampling_configs['t_start'] = 1.0

# PREPARE CONDITIONS
batch = {'captions': [caption] * batch_size}
conditions = core.get_conditions(batch, models, extras, is_eval=True, is_unconditional=False, eval_image_embeds=False)
unconditions = core.get_conditions(batch, models, extras, is_eval=True, is_unconditional=True, eval_image_embeds=False)
conditions_b = core_b.get_conditions(batch, models_b, extras_b, is_eval=True, is_unconditional=False)
unconditions_b = core_b.get_conditions(batch, models_b, extras_b, is_eval=True, is_unconditional=True)

with torch.no_grad(), torch.cuda.amp.autocast(dtype=torch.float32):
    # torch.manual_seed(42)

    sampling_c = extras.gdf.sample(
        models.generator, conditions, stage_c_latent_shape,
        unconditions, device=device, **extras.sampling_configs,
    )
    for (sampled_c, _, _) in tqdm(sampling_c, total=extras.sampling_configs['timesteps']):
        sampled_c = sampled_c

    # preview_c = models.previewer(sampled_c).float()
    # show_images(preview_c)

    conditions_b['effnet'] = sampled_c
    unconditions_b['effnet'] = torch.zeros_like(sampled_c)

    sampling_b = extras_b.gdf.sample(
        models_b.generator, conditions_b, stage_b_latent_shape,
        unconditions_b, device=device, **extras_b.sampling_configs
    )
    for (sampled_b, _, _) in tqdm(sampling_b, total=extras_b.sampling_configs['timesteps']):
        sampled_b = sampled_b
    sampled = models_b.stage_a.decode(sampled_b).float()

show_images(sampled)

偶然發現一件事,把 cfg 調成 8 後影像變這樣,之前用 comfyUI 會有奇怪的紋理應該就是這造成的。
https://ithelp.ithome.com.tw/upload/images/20240903/20168318DORVJDRp5P.png

調回原本範例給的數值結果會是這樣,好看多了,所以得給 Stable Cascade 更多自主性? 或是拉高 cfg 時得將 Prompt 寫的更加詳細?
https://ithelp.ithome.com.tw/upload/images/20240903/20168318IIzoDUMReF.png

他還有一個變換功能,可以給一張圖,他會基於這張影像略為變換。
範例給的連結失效了,我改拿 huggingface 的測試用圖。如果也失效的話就把下面那張三個太空人的圖下載下來用吧。

batch_size = 4
url = "https://cdn-lfs.huggingface.co/datasets/huggingface/documentation-images/42f0ee242d8caaee1aea5506c8318c6a920d559a63c6db8d79f993eebaf7d790?response-content-disposition=inline%3B+filename*%3DUTF-8%27%27img2img-init.png%3B+filename%3D%22img2img-init.png%22%3B&response-content-type=image%2Fpng&Expires=1725463863&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTcyNTQ2Mzg2M319LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2RuLWxmcy5odWdnaW5nZmFjZS5jby9kYXRhc2V0cy9odWdnaW5nZmFjZS9kb2N1bWVudGF0aW9uLWltYWdlcy80MmYwZWUyNDJkOGNhYWVlMWFlYTU1MDZjODMxOGM2YTkyMGQ1NTlhNjNjNmRiOGQ3OWY5OTNlZWJhZjdkNzkwP3Jlc3BvbnNlLWNvbnRlbnQtZGlzcG9zaXRpb249KiZyZXNwb25zZS1jb250ZW50LXR5cGU9KiJ9XX0_&Signature=Ft9aKO6umwkVeAdsSCdlYp7o9IFoAw%7EsBHxAkoOOQxJP89dhP8HRo3WNBU64Gj9UR65AwEL9FUfIWZ8Frr3DdyMqrFqUOIZjknuAxx7YJjnW4KaPtFkjaeeFFEiX4xVoI8WrtJwjTCnexu2acxgcQI469pBPgaasTNxpPKbVNOtmUHQm4%7EaLriuhoHecIkbWhNSCKL56B4gJFNCwgdNn4%7EU4NJZyNN3-GkHzCRnnL3pglp2mGRCtzdqXWcMsy5Ajo9tRCf5xrRdb4CwFEaZQ%7EhOboJPCasgS5pNfKtMR1i37xUCASPddsu2cA-AOjld9Pm8cdmvhZhEu%7EvwQyNAgNA__&Key-Pair-Id=K3ESJI6DHPFC7"
images = resize_image(download_image(url)).unsqueeze(0).expand(batch_size, -1, -1, -1).to(device)

batch = {'images': images}
show_images(batch['images'])

https://ithelp.ithome.com.tw/upload/images/20240903/20168318tTEVdsFwHs.png

接下來測試看看吧。

caption = ""
height, width = 1024, 1024
stage_c_latent_shape, stage_b_latent_shape = calculate_latent_sizes(height, width, batch_size=batch_size)

# Stage C Parameters
extras.sampling_configs['cfg'] = 4
extras.sampling_configs['shift'] = 2
extras.sampling_configs['timesteps'] = 20
extras.sampling_configs['t_start'] = 1.0

# Stage B Parameters
extras_b.sampling_configs['cfg'] = 1.1
extras_b.sampling_configs['shift'] = 1
extras_b.sampling_configs['timesteps'] = 10
extras_b.sampling_configs['t_start'] = 1.0

# PREPARE CONDITIONS
batch['captions'] = [caption] * batch_size

with torch.no_grad(), torch.cuda.amp.autocast(dtype=torch.bfloat16):
    # torch.manual_seed(42)
    conditions = core.get_conditions(batch, models, extras, is_eval=True, is_unconditional=False, eval_image_embeds=True)
    unconditions = core.get_conditions(batch, models, extras, is_eval=True, is_unconditional=True, eval_image_embeds=False)
    conditions_b = core_b.get_conditions(batch, models_b, extras_b, is_eval=True, is_unconditional=False)
    unconditions_b = core_b.get_conditions(batch, models_b, extras_b, is_eval=True, is_unconditional=True)

    sampling_c = extras.gdf.sample(
        models.generator, conditions, stage_c_latent_shape,
        unconditions, device=device, **extras.sampling_configs,
    )
    for (sampled_c, _, _) in tqdm(sampling_c, total=extras.sampling_configs['timesteps']):
        sampled_c = sampled_c

    preview_c = models.previewer(sampled_c).float()
    show_images(preview_c)

    conditions_b['effnet'] = sampled_c
    unconditions_b['effnet'] = torch.zeros_like(sampled_c)

    sampling_b = extras_b.gdf.sample(
        models_b.generator, conditions_b, stage_b_latent_shape,
        unconditions_b, device=device, **extras_b.sampling_configs
    )
    for (sampled_b, _, _) in tqdm(sampling_b, total=extras_b.sampling_configs['timesteps']):
        sampled_b = sampled_b
    sampled = models_b.stage_a.decode(sampled_b).float()

show_images(sampled)

成品長這樣:
https://ithelp.ithome.com.tw/upload/images/20240903/20168318O0dnhovhHd.png

最後是圖生圖,我一樣用太空人的影像。

batch_size = 2
url = "https://cdn-lfs.huggingface.co/datasets/huggingface/documentation-images/42f0ee242d8caaee1aea5506c8318c6a920d559a63c6db8d79f993eebaf7d790?response-content-disposition=inline%3B+filename*%3DUTF-8%27%27img2img-init.png%3B+filename%3D%22img2img-init.png%22%3B&response-content-type=image%2Fpng&Expires=1725463863&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTcyNTQ2Mzg2M319LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2RuLWxmcy5odWdnaW5nZmFjZS5jby9kYXRhc2V0cy9odWdnaW5nZmFjZS9kb2N1bWVudGF0aW9uLWltYWdlcy80MmYwZWUyNDJkOGNhYWVlMWFlYTU1MDZjODMxOGM2YTkyMGQ1NTlhNjNjNmRiOGQ3OWY5OTNlZWJhZjdkNzkwP3Jlc3BvbnNlLWNvbnRlbnQtZGlzcG9zaXRpb249KiZyZXNwb25zZS1jb250ZW50LXR5cGU9KiJ9XX0_&Signature=Ft9aKO6umwkVeAdsSCdlYp7o9IFoAw%7EsBHxAkoOOQxJP89dhP8HRo3WNBU64Gj9UR65AwEL9FUfIWZ8Frr3DdyMqrFqUOIZjknuAxx7YJjnW4KaPtFkjaeeFFEiX4xVoI8WrtJwjTCnexu2acxgcQI469pBPgaasTNxpPKbVNOtmUHQm4%7EaLriuhoHecIkbWhNSCKL56B4gJFNCwgdNn4%7EU4NJZyNN3-GkHzCRnnL3pglp2mGRCtzdqXWcMsy5Ajo9tRCf5xrRdb4CwFEaZQ%7EhOboJPCasgS5pNfKtMR1i37xUCASPddsu2cA-AOjld9Pm8cdmvhZhEu%7EvwQyNAgNA__&Key-Pair-Id=K3ESJI6DHPFC7"

images = resize_image(download_image(url)).unsqueeze(0).expand(batch_size, -1, -1, -1).to(device)

batch = {'images': images}

show_images(batch['images'])

稍微改掉原本範例的 Prompt,畢竟原本的連結也失效了,我換一個比較符合測試圖的。

caption = "three cowboy"
noise_level = 0.8
height, width = 1024, 1024
stage_c_latent_shape, stage_b_latent_shape = calculate_latent_sizes(height, width, batch_size=batch_size)

effnet_latents = core.encode_latents(batch, models, extras)
t = torch.ones(effnet_latents.size(0), device=device) * noise_level
noised = extras.gdf.diffuse(effnet_latents, t=t)[0]

# Stage C Parameters
extras.sampling_configs['cfg'] = 4
extras.sampling_configs['shift'] = 2
extras.sampling_configs['timesteps'] = int(20 * noise_level)
extras.sampling_configs['t_start'] = noise_level
extras.sampling_configs['x_init'] = noised

# Stage B Parameters
extras_b.sampling_configs['cfg'] = 1.1
extras_b.sampling_configs['shift'] = 1
extras_b.sampling_configs['timesteps'] = 10
extras_b.sampling_configs['t_start'] = 1.0

# PREPARE CONDITIONS
batch['captions'] = [caption] * batch_size

with torch.no_grad(), torch.cuda.amp.autocast(dtype=torch.bfloat16):
    # torch.manual_seed(42)
    conditions = core.get_conditions(batch, models, extras, is_eval=True, is_unconditional=False, eval_image_embeds=False)
    unconditions = core.get_conditions(batch, models, extras, is_eval=True, is_unconditional=True, eval_image_embeds=False)
    conditions_b = core_b.get_conditions(batch, models_b, extras_b, is_eval=True, is_unconditional=False)
    unconditions_b = core_b.get_conditions(batch, models_b, extras_b, is_eval=True, is_unconditional=True)

    sampling_c = extras.gdf.sample(
        models.generator, conditions, stage_c_latent_shape,
        unconditions, device=device, **extras.sampling_configs,
    )
    for (sampled_c, _, _) in tqdm(sampling_c, total=extras.sampling_configs['timesteps']):
        sampled_c = sampled_c

    preview_c = models.previewer(sampled_c).float()
    show_images(preview_c)

    conditions_b['effnet'] = sampled_c
    unconditions_b['effnet'] = torch.zeros_like(sampled_c)

    sampling_b = extras_b.gdf.sample(
        models_b.generator, conditions_b, stage_b_latent_shape,
        unconditions_b, device=device, **extras_b.sampling_configs
    )
    for (sampled_b, _, _) in tqdm(sampling_b, total=extras_b.sampling_configs['timesteps']):
        sampled_b = sampled_b
    sampled = models_b.stage_a.decode(sampled_b).float()

show_images(batch['images'])
show_images(sampled)

成品:
https://ithelp.ithome.com.tw/upload/images/20240903/20168318F11ToNkhXy.png

為了省資源,我用的是 small-small 版本,大模型的精細度應該會更好。


上一篇
Day 20: Stable Cascade 架設-3
下一篇
Day 22: Cursor
系列文
AI 學習紀錄30
圖片
  直播研討會
圖片
{{ item.channelVendor }} {{ item.webinarstarted }} |
{{ formatDate(item.duration) }}
直播中

尚未有邦友留言

立即登入留言